library(MplusAutomation)
library(tidyverse)
library(here)
library(gt)
# sankey / alluvial packages
library(networkD3)
library(parcats)
library(easyalluvial)
library(ggalluvial)Testing Sankey & Alluvial Daigram Variants
LTA context issues:
- The parameters typically used is in LTA plots are the transition probabilities (think the stratum b/w categorical variables) & class sizes (proportions in each category).
- Longitudinal model: 1st nominal variable (left) is time-1 with 4-classes and 2nd variable (right) is for time-2 with 4-classes
- Missing from these plots is the time-1 class sizes (currently each class is incorrectly 25%). I am not sure how to incorporate the true class size information.
Read invariance model and extract parameters (intercepts and multinomial regression coefficients)
## <simpleError in file(file, "rt"): cannot open the connection>
par <- as_tibble(lta_inv1[["parameters"]][["unstandardized"]]) %>%
select(1:3) %>%
filter(grepl('ON|Means', paramHeader)) %>%
mutate(est = as.numeric(est))Manual method to calculate transition probabilities
# Name each parameter individually to make the subsequent calculations more readable
a1 <- unlist(par[13,3]); a2 <- unlist(par[14,3]); a3 <- unlist(par[15,3]); b11 <- unlist(par[1,3]);
b21 <- unlist(par[4,3]); b31 <- unlist(par[7,3]); b12 <- unlist(par[2,3]); b22 <- unlist(par[5,3]);
b32 <- unlist(par[8,3]); b13 <- unlist(par[3,3]); b23 <- unlist(par[6,3]); b33 <- unlist(par[9,3])
# Calculate transition probabilities from the logit parameters
t11 <- exp(a1+b11)/(exp(a1+b11)+exp(a2+b21)+exp(a3+b31)+exp(0))
t12 <- exp(a2+b21)/(exp(a1+b11)+exp(a2+b21)+exp(a3+b31)+exp(0))
t13 <- exp(a3+b31)/(exp(a1+b11)+exp(a2+b21)+exp(a3+b31)+exp(0))
t14 <- 1 - (t11 + t12 + t13)
t21 <- exp(a1+b12)/(exp(a1+b12)+exp(a2+b22)+exp(a3+b32)+exp(0))
t22 <- exp(a2+b22)/(exp(a1+b12)+exp(a2+b22)+exp(a3+b32)+exp(0))
t23 <- exp(a3+b32)/(exp(a1+b12)+exp(a2+b22)+exp(a3+b32)+exp(0))
t24 <- 1 - (t21 + t22 + t23)
t31 <- exp(a1+b13)/(exp(a1+b13)+exp(a2+b23)+exp(a3+b33)+exp(0))
t32 <- exp(a2+b23)/(exp(a1+b13)+exp(a2+b23)+exp(a3+b33)+exp(0))
t33 <- exp(a3+b33)/(exp(a1+b13)+exp(a2+b23)+exp(a3+b33)+exp(0))
t34 <- 1 - (t31 + t32 + t33)
t41 <- exp(a1)/(exp(a1)+exp(a2)+exp(a3)+exp(0))
t42 <- exp(a2)/(exp(a1)+exp(a2)+exp(a3)+exp(0))
t43 <- exp(a3)/(exp(a1)+exp(a2)+exp(a3)+exp(0))
t44 <- 1 - (t41 + t42 + t43)Create transition table
t_matrix <- tibble(
"Time1" = c("C1=1","C1=2","C1=3","C1=4"),
"C2=1" = c(t11,t21,t31,t41),
"C2=2" = c(t12,t22,t32,t42),
"C2=3" = c(t13,t23,t33,t43),
"C2=4" = c(t14,t24,t34,t44))
t_matrix %>%
gt(rowname_col = "Time1") %>%
tab_header(
title = md("**Student transitions from 7th grade (rows) to 10th grade (columns)**"),
subtitle = md(" ")) %>%
fmt_number(2:5,decimals = 2) %>%
tab_spanner(label = "10th grade",columns = 2:5)| Student transitions from 7th grade (rows) to 10th grade (columns) | ||||
|---|---|---|---|---|
| Â | ||||
| 10th grade | ||||
| C2=1 | C2=2 | C2=3 | C2=4 | |
| C1=1 | 0.27 | 0.27 | 0.32 | 0.15 |
| C1=2 | 0.09 | 0.56 | 0.19 | 0.16 |
| C1=3 | 0.15 | 0.21 | 0.52 | 0.12 |
| C1=4 | 0.08 | 0.35 | 0.27 | 0.30 |
Plot LTA transitions
Type 1: LTA plot using plotLTA from the package {MplusAutomation}
- issue1: node proportions & transition proportions information is missing (these need to be labeled!)
- issue2: change to faceted plot
Type 2: Sankey interactive diagram using the package {networkD3}
- issue 1: Proportions at time 1 are incorrect. Unclear how to incorporate initial class size data into chart.
- issue 2: Unclear how to label node and transition. Examples to work from are unavailable.
Change to long-format
trans_long <- t_matrix %>%
pivot_longer(`C2=1`:`C2=4`, # The columns I'm gathering together
names_to = "Time2", # new column name for existing names
values_to = "value") # new column name to store values
nodes <- data.frame(name=c(as.character(trans_long$Time1),
as.character(trans_long$Time2)) %>%
unique())
trans_long$IDTime1=match(trans_long$Time1, nodes$name)-1
trans_long$IDTime2=match(trans_long$Time2, nodes$name)-1Prepare color scale
ColourScal ='d3.scaleOrdinal().range([
"#FDE725FF","#B4DE2CFF","#6DCD59FF","#35B779FF","#1F9E89FF",
"#26828EFF","#31688EFF","#3E4A89FF","#482878FF","#440154FF"])'It’s interactive!
Type 3: LTA alluvial plot using the package {ggalluvial}
- issue 1: Proportions at time 1 are incorrect. Unclear how to incorporate initial class size data into chart.
- issue 2: Unclear how to label node and transitions in a clear manner. Examples to work from are unavailable.
#library(gghighlight)
ggplot(trans_long,
aes(axis1 = Time1, axis2 = Time2,
y = value)) +
scale_x_discrete(limits = c("7th Grade", "10th Grade"), expand = c(.2, .05)) +
geom_alluvium(aes(fill = Time1), show.legend = FALSE) +
geom_stratum() +
#geom_text(stat = "alluvium", aes(label = round(value,2), color = Time1),
# show.legend = FALSE, fontface = 2, position = position_nudge(x = -0.21)) +
theme_minimal()Type 4: A cool interactive alluvial plot using the package {easyalluvial}
Currently this plot makes little sense, it was just the first version I could get to run. More exploration needed!
library(parcats)
library(easyalluvial)
p = alluvial_wide(t_matrix, max_variables = 5)
parcats(p, marginal_histograms = TRUE, data_input = t_matrix)Cite all packages
Second attempt: go the ggplot route (rather than rely on a package)
Things I like about this solution:
- Faceted plot: The focus in LTA is on the transitions therefore having all 16 transitions superimposed is not ideal
- Labeled transitions & nodes: This info is critical to my application
Things I DON’T like:
- I prefer the Alluvial/Sankey style of having the discrete categories represented by a bar (left/right side of plots) cut into pieces. This makes sense in the LTA because the whole sample (\(P=1\)) is proportioned into classes. The nodes don’t capture this part/whole relationship as clearly.
load the model
## <simpleError in file(file, "rt"): cannot open the connection>
This code is derived from the plotLTA function found with the source code from the MplusAutomation package
- Since the code I started with was written for a package it still contains remnants of code related to the function arguments.
- Sorry about the mess!
Source code: https://github.com/michaelhallquist/MplusAutomation/blob/995d1ecfae3656524153456ce647f86fe8c1cf1e/R/mixtures.R
Function arguments for plotLTA
# set the thickness of the node (circle outlines)
node_stroke = 2
# set the maximum thickness of the edges (lines)
max_edge_width = 2Extract edge data
# create vector with all classes
all_classes <- unique(c(edges$from, edges$to))
# create vector of latent variable names only
latent_variables <- unique(gsub("\\..+$", "", all_classes))
# create 4 new columns a start and end for each axis of plot
edges$x <- as.numeric(factor(gsub("\\..+$", "", as.character(edges$from)), levels = latent_variables))
edges$xend <-as.numeric(factor(gsub("\\..+$", "", as.character(edges$to)), levels = latent_variables))
edges$y <- as.numeric(gsub("^.+\\.", "", as.character(edges$from)))
edges$yend <- as.numeric(gsub("^.+\\.", "", as.character(edges$to)))
# transition labels:
# 1. create column of transitions in percent scale
# 2. calculate label positions = y-value of segment midpoints
edges <- edges %>%
mutate(trans_perc = round(probability*100,0)) %>%
mutate(y_mid = y + ((yend - y)/2)) Create a nodes data.frame
nodes <- rbind(edges[, c(1, 4, 6)], setNames(edges[, c(2, 5, 7)], names(edges)[c(1, 4, 6)]))
nodes <- nodes[!duplicated(nodes),]
names(nodes)[1] <- "nodeID"Create a n_prop data.frame containing class count/proportions used to set edge & node thickness.
# Extract counts and proportions
n_prop <- mplusModel$class_counts$modelEstimated %>%
mutate(percent = round(proportion*100,0))
# Convert 'proportions' column to stroke thickness values
if (!is.null(node_stroke)) {
n_prop$proportion <-
node_stroke * n_prop$proportion * inverse.rle(list(
lengths = rle(n_prop$variable)$lengths,
values = rle(n_prop$variable)$lengths
))
} else {
n_prop$proportion <- 1
}
# Add column nodeID
n_prop$nodeID <- paste(n_prop$variable, n_prop$class, sep = ".")
# merge data.frames 'nprop' and 'nodes'
nodes <- merge(nodes, n_prop, by = "nodeID")
# ------------------------------------------
# FAILED ATTEMPTS AT RE-ORDERING CLASSES
# (classes are nominal but plot dimensions are treated as continuous)
# ------------------------------------------
# re-order nominal classes based on researcher preference
# nodes <- nodes %>%
# mutate(y = case_when(
# y == 2 ~ 1,
# y == 1 ~ 2,
# y == 3 ~ 3,
# y == 4 ~ 4
# ))
# nodes <- nodes %>%
# mutate(y = factor(y,
# levels = c("2","1","3","4"),
# labels = c("C1","C2","C3","C4")))
# ------------------------------------------Set node size value
# original value from function = 3.880556
nodesize <- max(max(sapply(nodes$nodeID, nchar)) * 4.5, 6)Make the plot with ggplot in a step-wise fashion
- start with blank slate
- Create the edges with
geom_segment()
p1 <- p0 + geom_segment(
data = edges,
aes(
x = x,
y = y,
xend = xend,
yend = yend,
size = probability,
alpha = probability),
color = "black") +
scale_x_continuous(breaks = c(1,2), labels = c('1' = "7th Grade", '2' = "10th Grade")) +
scale_alpha_continuous(range = c(.05,.8))
p1This is too much going on with all 16 overlapping transitions
- Facet wrap by class + some extra formatting:
- re-label the facets manually
- re-label the x-axis
- expand grid to make room for large nodes
- reverse y-axis scale so ‘4’ is at the top
- add the transition label layer to the plot
p2 <- p1 +
facet_wrap(~y, labeller = labeller(
y = c(`1` = "Transitions from K = 1 in 7th grade", `2` = "Transitions from K = 2 in 7th grade",
`3` = "Transitions from K = 3 in 7th grade", `4` = "Transitions from K = 4 in 7th grade"))) +
scale_x_continuous(expand = c(.1, .1), breaks = c(1,2), labels = c('1' = "7th Grade", '2' = "10th Grade")) +
scale_y_reverse(expand = c(.15, .15)) +
geom_text_repel(data=edges, aes(x=xend, y=y_mid, label = glue("{trans_perc}%")),
segment.colour = NA,
nudge_x = -.5
)
p2- Add nodes (first attempt)
p3 <- p2 + geom_point(
data = nodes,
shape = 21,
size = nodesize,
colour = "black",
fill = "white",
aes(x = x, y = y, stroke = proportion)
)
p3 This is not what I want I need all 8 nodes to be printed within each facet
- Add nodes (second attempt)
To include all 8 nodes in each facet a data.frame was used nodes2 in which the faceting variable y is absent.
nodes2 <- nodes %>% rename(x1 = x, y1 = y) %>%
mutate(class = factor(class))
p4 <- p2 + geom_point(
data = nodes2,
shape = 21,
size = nodesize,
fill = "white",
aes_string(x = "x1", y = "y1", stroke = "proportion")
)
p4 - Add in node labels
p5 <- p4 +
geom_text(data = nodes2, aes(x = x1, y = y1, label = paste0(percent,"%"))) +
theme_void() +
theme(axis.text.x = element_text())
p5Save BW transition plot
Create color version of figure
pallete=pnw_palette("Bay", n=4, type = "discrete")
p6 <- p3 + geom_point(
data = nodes2,
shape = 21,
size = nodesize,
color = "black",
aes_string(x = "x1", y = "y1", stroke = "proportion", fill = "class"), show.legend = FALSE) +
scale_fill_manual("", values = rev(pallete)) +
geom_text(data = nodes2, aes(x = x1, y = y1, label = paste0(percent,"%"))) +
theme_void()
p6 Save color transition plot